#Load libraries



library(TCGAWorkflowData)
library(TCGAbiolinks)
package ‘TCGAbiolinks’ was built under R version 4.1.2Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
 =============================================================
 ______  ___  ____   ___                                        
   ||   |    |      |   | |    o  __  |   o  _         __         
   ||   |    | ___  |___| |__  | |  | |   | | | | |_/ |__         
   ||   |___ |____| |   | |__| | |__| |__ | | |_| | \  __|       
 ------------------------------------------------------------
 Query, download & analyze - GDC                  
 Version:2.22.4
 ==============================================================
library(SummarizedExperiment)
Loading required package: MatrixGenerics
Loading required package: matrixStats
package ‘matrixStats’ was built under R version 4.1.2
Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins,
    colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, colMads,
    colMaxs, colMeans2, colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges,
    colRanks, colSdDiffs, colSds, colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs,
    rowAnys, rowAvgsPerColSet, rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs,
    rowMeans2, rowMedians, rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads,
    rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars

Loading required package: GenomicRanges
package ‘GenomicRanges’ was built under R version 4.1.2Loading required package: stats4
Loading required package: BiocGenerics

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated,
    eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
    rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which.max, which.min

Loading required package: S4Vectors
package ‘S4Vectors’ was built under R version 4.1.3
Attaching package: ‘S4Vectors’

The following objects are masked from ‘package:base’:

    expand.grid, I, unname

Loading required package: IRanges
Loading required package: GenomeInfoDb
package ‘GenomeInfoDb’ was built under R version 4.1.2Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor,
    see 'citation("Biobase")', and for packages 'citation("pkgname")'.


Attaching package: ‘Biobase’

The following object is masked from ‘package:MatrixGenerics’:

    rowMedians

The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians
library(edgeR)
Loading required package: limma
package ‘limma’ was built under R version 4.1.3
Attaching package: ‘limma’

The following object is masked from ‘package:BiocGenerics’:

    plotMA
library(EnhancedVolcano)
Loading required package: ggplot2
package ‘ggplot2’ was built under R version 4.1.2Loading required package: ggrepel
Registered S3 methods overwritten by 'ggalt':
  method                  from   
  grid.draw.absoluteGrob  ggplot2
  grobHeight.absoluteGrob ggplot2
  grobWidth.absoluteGrob  ggplot2
  grobX.absoluteGrob      ggplot2
  grobY.absoluteGrob      ggplot2
library(Seurat)
package ‘Seurat’ was built under R version 4.1.2Attaching SeuratObject
Attaching sp

Attaching package: ‘Seurat’

The following object is masked from ‘package:SummarizedExperiment’:

    Assays
library(lumi)
Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
No methods found in package ‘RSQLite’ for request: ‘dbListFields’ when loading ‘lumi’
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1
✔ purrr   0.3.4     
package ‘tibble’ was built under R version 4.1.2package ‘tidyr’ was built under R version 4.1.2package ‘readr’ was built under R version 4.1.2package ‘dplyr’ was built under R version 4.1.2── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::collapse()   masks IRanges::collapse()
✖ dplyr::combine()    masks lumi::combine(), Biobase::combine(), BiocGenerics::combine()
✖ dplyr::count()      masks matrixStats::count()
✖ dplyr::desc()       masks IRanges::desc()
✖ tidyr::expand()     masks S4Vectors::expand()
✖ dplyr::filter()     masks stats::filter()
✖ dplyr::first()      masks S4Vectors::first()
✖ dplyr::lag()        masks stats::lag()
✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename()     masks S4Vectors::rename()
✖ dplyr::slice()      masks IRanges::slice()
options(ggrepel.max.overlaps = Inf)

#Load TCGA annotations

mrna_query <- GDCquery(project = "TCGA-PRAD",
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = "STAR - Counts")
--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-PRAD
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
mrna_query$results[[1]]



nrml_meta <- mrna_query$results[[1]] %>% dplyr::filter(sample_type == "Solid Tissue Normal")
#GDCdownload(mrna_query, method = "api", directory = "/Users/ds/Desktop")

#Process normal prostate counts

full_path <- "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/"



nrml_rna <- do.call(cbind, lapply(list.files(path = full_path), FUN = function(f){
  print(paste0("processing: ",list.files(path = paste0(full_path, f))))
  counts <- read_tsv(file = list.files(path = paste0(full_path, f), full.names = T), skip = 6, col_names = F )
  
  return(data.frame(f = counts[,4]))
  
  
}))


symbols <- read_tsv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/0427d696-a33d-4005-ac1d-c5363754226e/b1becd71-bbed-4a0f-af9c-94b9b2af928b.rna_seq.augmented_star_gene_counts.tsv", skip = 6, col_names = F)[,2]
colnames(nrml_rna) <- list.files(path = full_path)




nrml_rna$Gene <- symbols$X2

write.table(x = nrml_rna, file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/normal_rna_counts.txt", quote = F, sep = "\t", row.names = F, col.names = T)

#Change IDS for normals

nrml_rna <- read.table(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/normal_rna_counts.txt", header = T)


nrml_rna <- nrml_rna[!duplicated(nrml_rna$Gene),]


row.names(nrml_rna) <- nrml_rna$Gene 

nrml_rna <- dplyr::select(nrml_rna, -Gene)

colnames(nrml_rna) <- colnames(nrml_rna) %>% gsub(pattern = "^X*", replacement = "")

colnames(nrml_rna) <- colnames(nrml_rna) %>% gsub(pattern = "\\.", replacement = "-")



nrml_rna <-  cpm(nrml_rna, log = T) %>% t() %>% data.frame()#Log normalize
#nrml_rna <- nrml_rna %>% t() %>% data.frame()

nrml_rna <- nrml_rna %>% mutate(id = row.names(.), batch = "Normal") %>% merge(dplyr::select(nrml_meta , c(id, cases.submitter_id) ))
nrml_rna <- nrml_rna %>% mutate(cases = cases.submitter_id, Purity = NA, Grade_Mutational_status =NA)

#Download TCGA PRAD sample list


prad_meta<- mrna_query$results[[1]] %>% dplyr::filter(sample_type == "Primary Tumor")
#GDCdownload(mrna_query, method = "api", directory = "/Users/ds/Desktop")

#Process PRAD counts

full_path <- "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/"

prad_rna <- do.call(cbind, lapply(list.files(path = full_path), FUN = function(f){
  print(paste0("processing: ",list.files(path = paste0(full_path, f))))
  counts <- read_tsv(file = list.files(path = paste0(full_path, f), full.names = T), skip = 6, col_names = F )
  
  return(data.frame(f = counts[,4]))
  
  
}))
#prad_rna <- cpm(prad_rna, log = T, prior.count = 1) %>% data.frame()
colnames(prad_rna) <- list.files(path = full_path)

symbols <- read_tsv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/0007888f-8d96-4c01-8251-7fef6cc71596/88215dd0-5841-44f1-9393-eefd8238cbb3.rna_seq.augmented_star_gene_counts.tsv", skip = 6, col_names = F)[,2]

prad_rna$Gene <- symbols$X2


#write.table(x = prad_rna, file = "/Users/ds/Desktop/TCGA/TCGA-PRAD-Tumor/prad_rna_cpm.txt", quote = F, sep = "\t", row.names = F, col.names = T)

write.table(x = prad_rna, file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/prad_rna_counts.txt", quote = F, sep = "\t", row.names = F, col.names = T)

#Format PRAD count data

prad_rna$Grade_Mutational_status
  [1]  7  6  7  7  7  6  7  7  7  9  7  8  7  7  7  7  7  9  7  6  6  7  7  6  7  7  9  7  7  7  6  6  7  6  6  7  6  7  7  9  9  6  8  9  7  6  9  7  7
 [50]  7  7  7  8  6  9  7  7  7  7  7  7  8  6  7  7  7  6  7  6  7  7  8  6  7  9  7  7  7  6  7  7  6  8  7  7  9  7  7  7  7  7  7  6  7  7  7  8  8
 [99]  6  7  7  8  7  7  7  9  7  9  6  6  9  8  6  7  8  7  6  6  7  8  7  8  9  7  6  9  6  7  7  7  6  7  7  8  7  7  7  7  9  8  7  6  6  9  7  8  7
[148]  8  7  7  7  7  7  6  7  7  7  6  6  7  9  7  7  8  6  6  9  9  6  8  7  7  9  6  7  9  7  9  7  7  9  6  8  9  7  6  7  7  8  7  9  8  9  7  8  7
[197]  8  7  8  8  7  7  7  6  9  6  6  7  6  8  6  7  8  7  7  6  6  9  7  6  7  7  7  7  7  6  7  7  6  7  7  7  6  6  7  7  7  7  7  7  9  6  8  7  7
[246]  7  8  8  7  6  7  8  6  7  8  7  9  6  8  8  8  6  9  9  7  7  7  7  8  7  9  9  7  7  7  7  8  7  7  7  8  7  7  7  7  8  9  7  7  7  7  7  7  6
[295]  9  7  6  7  6  8  9  7  7  6  7  7  7  6  6  7  7  8  6  7  7  8  7  8  9  6  7  7  6  8  7  9  7  7  9  7  7  7  9  7  7 10

#Add PRAD purity from GDC website

anno_prad <- read.csv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/prad_supp.csv", header = T)
anno_prad <- anno_prad %>% dplyr::select(avgRNA_purity, PATIENT_ID, Reviewed_Gleason_sum)


prad_rna <- inner_join(prad_rna, anno_prad, by = c("cases.submitter_id" = "PATIENT_ID")) %>% mutate(Purity = avgRNA_purity * 100, cases = cases.submitter_id, Grade_Mutational_status = Reviewed_Gleason_sum)


anno_prad
NA

#Format mCRPC count data

anno_mcrpc <- read.table("/Users/ds/Desktop/projects/data/anno/202009_deepRNAseq_sample_full.txt", header =T)




anno_mcrpc <- anno_mcrpc %>% dplyr::select(c(sample_id, wgs_id, biopsy_site,tumor_purity_wgs, tumor_purity_rna , tumor_purity_hist, AR))


anno_mcrpc <- anno_mcrpc %>% mutate(cases =  sample_id, Purity = tumor_purity_rna, Grade_Mutational_status =AR) %>% dplyr::select(cases, Purity, Grade_Mutational_status)



mcrpc_rna <- read.table(file = "/Users/ds/Desktop/projects/data/rna/mCRPC_RNA_counts_genename.txt", header = T)
mcrpc_rna <- mcrpc_rna[!duplicated(mcrpc_rna$Genename),]




row.names(mcrpc_rna) <- mcrpc_rna$Genename
mcrpc_rna <- mcrpc_rna[,-1]
mcrpc_rna <- cpm(mcrpc_rna, log = T) %>% t() %>% data.frame()#Log normalize for heatmap
#mcrpc_rna <- mcrpc_rna %>% t() %>% data.frame()#Counts for DEG analysis
row.names(mcrpc_rna) <- gsub("\\.", "-",  x = row.names(mcrpc_rna))



mcrpc_rna <- mcrpc_rna %>% mutate(cases = row.names(.), batch = "mCRPC")

#Add mCRPC purity
mcrpc_rna <- mcrpc_rna %>% merge(anno_mcrpc)

#Merge PRAD and mCRPC datasets (DEG analysis)

#Merge PRAD, Normal, and mCRPC datasets (Heatmap)

query <- intersect(colnames(prad_rna), colnames(mcrpc_rna)) %>% intersect( colnames(nrml_rna))#Get lncRNAs in common + cases + purity

#Combine PRAD and mCRPC - rows are samples and columns are lncRNAs + RNA purity + batch
comb_rna <- rbind( nrml_rna[,query], arrange(prad_rna[,query],Grade_Mutational_status), arrange(mcrpc_rna[,query],Grade_Mutational_status))


comb_rna <- data.frame(comb_rna[,c("batch", "Purity", "cases", "Grade_Mutational_status")], comb_rna[,colnames(comb_rna) %in% c(pca_lncrna, notpca_lncrna)])


comb_rna <- comb_rna %>% mutate(cases = paste0(cases, ":", 1:nrow(comb_rna)))


row.names(comb_rna) <- comb_rna$cases
comb_rna$batch %>% table()
.
 mCRPC Normal   PRAD 
    74     52    336 

#Plotting heatmap of progression for prostate lncRNA

#Plot lncRNAs in TME

length(filt)
[1] 49
---
title: "R Notebook"
output: html_notebook
---

#Load libraries
```{r}


library(TCGAWorkflowData)
library(TCGAbiolinks)
library(SummarizedExperiment)
library(edgeR)
library(EnhancedVolcano)
library(Seurat)
library(lumi)
library(tidyverse)
library(pheatmap)
options(ggrepel.max.overlaps = Inf)

```


#Load TCGA annotations
```{r}
mrna_query <- GDCquery(project = "TCGA-PRAD",
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = "STAR - Counts")
mrna_query$results[[1]]



nrml_meta <- mrna_query$results[[1]] %>% dplyr::filter(sample_type == "Solid Tissue Normal")
#GDCdownload(mrna_query, method = "api", directory = "/Users/ds/Desktop")
```

#Process normal prostate counts
```{r}
full_path <- "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/"



nrml_rna <- do.call(cbind, lapply(list.files(path = full_path), FUN = function(f){
  print(paste0("processing: ",list.files(path = paste0(full_path, f))))
  counts <- read_tsv(file = list.files(path = paste0(full_path, f), full.names = T), skip = 6, col_names = F )
  
  return(data.frame(f = counts[,4]))
  
  
}))


symbols <- read_tsv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/0427d696-a33d-4005-ac1d-c5363754226e/b1becd71-bbed-4a0f-af9c-94b9b2af928b.rna_seq.augmented_star_gene_counts.tsv", skip = 6, col_names = F)[,2]
colnames(nrml_rna) <- list.files(path = full_path)




nrml_rna$Gene <- symbols$X2

write.table(x = nrml_rna, file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/normal_rna_counts.txt", quote = F, sep = "\t", row.names = F, col.names = T)
```

#Change IDS for normals
```{r}
nrml_rna <- read.table(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/normal_rna_counts.txt", header = T)


nrml_rna <- nrml_rna[!duplicated(nrml_rna$Gene),]


row.names(nrml_rna) <- nrml_rna$Gene 

nrml_rna <- dplyr::select(nrml_rna, -Gene)

colnames(nrml_rna) <- colnames(nrml_rna) %>% gsub(pattern = "^X*", replacement = "")

colnames(nrml_rna) <- colnames(nrml_rna) %>% gsub(pattern = "\\.", replacement = "-")



nrml_rna <-  cpm(nrml_rna, log = T) %>% t() %>% data.frame()#Log normalize
#nrml_rna <- nrml_rna %>% t() %>% data.frame()

nrml_rna <- nrml_rna %>% mutate(id = row.names(.), batch = "Normal") %>% merge(dplyr::select(nrml_meta , c(id, cases.submitter_id) ))
nrml_rna <- nrml_rna %>% mutate(cases = cases.submitter_id, Purity = NA, Grade_Mutational_status =NA)
```



#Download TCGA PRAD sample list
```{r}

prad_meta<- mrna_query$results[[1]] %>% dplyr::filter(sample_type == "Primary Tumor")
#GDCdownload(mrna_query, method = "api", directory = "/Users/ds/Desktop")
```



#Process PRAD counts
```{r}
full_path <- "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/"

prad_rna <- do.call(cbind, lapply(list.files(path = full_path), FUN = function(f){
  print(paste0("processing: ",list.files(path = paste0(full_path, f))))
  counts <- read_tsv(file = list.files(path = paste0(full_path, f), full.names = T), skip = 6, col_names = F )
  
  return(data.frame(f = counts[,4]))
  
  
}))
#prad_rna <- cpm(prad_rna, log = T, prior.count = 1) %>% data.frame()
colnames(prad_rna) <- list.files(path = full_path)

symbols <- read_tsv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/0007888f-8d96-4c01-8251-7fef6cc71596/88215dd0-5841-44f1-9393-eefd8238cbb3.rna_seq.augmented_star_gene_counts.tsv", skip = 6, col_names = F)[,2]

prad_rna$Gene <- symbols$X2


#write.table(x = prad_rna, file = "/Users/ds/Desktop/TCGA/TCGA-PRAD-Tumor/prad_rna_cpm.txt", quote = F, sep = "\t", row.names = F, col.names = T)

write.table(x = prad_rna, file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/prad_rna_counts.txt", quote = F, sep = "\t", row.names = F, col.names = T)
```


#Format PRAD count data
```{r}
#prad_rna <- read.table(file = "/Users/ds/Desktop/TCGA/TCGA-PRAD-Tumor/prad_rna_cpm.txt", header = T)

prad_rna <- read.table(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/prad_rna_counts.txt", header = T)
prad_rna <- prad_rna[!duplicated(prad_rna$Gene),]


row.names(prad_rna) <- prad_rna$Gene 

prad_rna <- dplyr::select(prad_rna, -Gene)

colnames(prad_rna) <- colnames(prad_rna) %>% gsub(pattern = "^X*", replacement = "")

colnames(prad_rna) <- colnames(prad_rna) %>% gsub(pattern = "\\.", replacement = "-")

prad_rna <-  cpm(prad_rna, log = T) %>% t() %>% data.frame()#Log normalize for heatmap
#prad_rna <- prad_rna %>% t() %>% data.frame()#Counts for DEG analysis



prad_rna <- prad_rna %>% mutate(id = row.names(.), batch = "PRAD") %>% merge(dplyr::select(prad_meta , c(id, cases.submitter_id) ))




```



#Add PRAD purity from GDC website
```{r}
anno_prad <- read.csv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/prad_supp.csv", header = T)
anno_prad <- anno_prad %>% dplyr::select(avgRNA_purity, PATIENT_ID, Reviewed_Gleason_sum)


prad_rna <- inner_join(prad_rna, anno_prad, by = c("cases.submitter_id" = "PATIENT_ID")) %>% mutate(Purity = avgRNA_purity * 100, cases = cases.submitter_id, Grade_Mutational_status = Reviewed_Gleason_sum)


anno_prad

```



#Format mCRPC count data
```{r}
anno_mcrpc <- read.table("/Users/ds/Desktop/projects/data/anno/202009_deepRNAseq_sample_full.txt", header =T)




anno_mcrpc <- anno_mcrpc %>% dplyr::select(c(sample_id, wgs_id, biopsy_site,tumor_purity_wgs, tumor_purity_rna , tumor_purity_hist, AR))


anno_mcrpc <- anno_mcrpc %>% mutate(cases =  sample_id, Purity = tumor_purity_rna, Grade_Mutational_status =AR) %>% dplyr::select(cases, Purity, Grade_Mutational_status)



mcrpc_rna <- read.table(file = "/Users/ds/Desktop/projects/data/rna/mCRPC_RNA_counts_genename.txt", header = T)
mcrpc_rna <- mcrpc_rna[!duplicated(mcrpc_rna$Genename),]




row.names(mcrpc_rna) <- mcrpc_rna$Genename
mcrpc_rna <- mcrpc_rna[,-1]
mcrpc_rna <- cpm(mcrpc_rna, log = T) %>% t() %>% data.frame()#Log normalize for heatmap
#mcrpc_rna <- mcrpc_rna %>% t() %>% data.frame()#Counts for DEG analysis
row.names(mcrpc_rna) <- gsub("\\.", "-",  x = row.names(mcrpc_rna))



mcrpc_rna <- mcrpc_rna %>% mutate(cases = row.names(.), batch = "mCRPC")

#Add mCRPC purity
mcrpc_rna <- mcrpc_rna %>% merge(anno_mcrpc)



```

#Merge PRAD and mCRPC datasets (DEG analysis)
```{r}

query <- intersect(colnames(prad_rna), colnames(mcrpc_rna))#Get lncRNAs in common + cases + purity

(length(query)-2)/length( linc$gene.name) #% of lncRNAs common across datasets

#Combine PRAD and mCRPC - rows are samples and columns are lncRNAs + RNA purity
rna <- rbind(prad_rna[,query], mcrpc_rna[,query])


rna

```

#Merge PRAD, Normal, and mCRPC datasets (Heatmap)
```{r}
query <- intersect(colnames(prad_rna), colnames(mcrpc_rna)) %>% intersect( colnames(nrml_rna))#Get lncRNAs in common + cases + purity

#Combine PRAD and mCRPC - rows are samples and columns are lncRNAs + RNA purity + batch
comb_rna <- rbind( nrml_rna[,query], arrange(prad_rna[,query],Grade_Mutational_status), arrange(mcrpc_rna[,query],Grade_Mutational_status))


comb_rna <- data.frame(comb_rna[,c("batch", "Purity", "cases", "Grade_Mutational_status")], comb_rna[,colnames(comb_rna) %in% c(pca_lncrna, notpca_lncrna)])


comb_rna <- comb_rna %>% mutate(cases = paste0(cases, ":", 1:nrow(comb_rna)))


row.names(comb_rna) <- comb_rna$cases
comb_rna$batch %>% table()



```



#Plotting heatmap of progression for prostate lncRNA
```{r}
mat <- comb_rna[,colnames(comb_rna) %in% pca_lncrna]

filt <- t(mat)[apply(t(mat), 1, FUN = function(x){sum(x < 0)}) == ncol(t(mat)),] %>% row.names()

mat <- mat[,!colnames(mat) %in% filt]

anno <- dplyr::select(comb_rna, c(batch, Purity, Grade_Mutational_status))


colnames(anno) <- c("Batch", "RNA.Tumor.Purity", "GS.Mutational.status")
anno <- mutate(anno, GS.Mutational.status = ifelse(GS.Mutational.status == 2, "AR WT", ifelse(GS.Mutational.status == 3, "AR Gain", GS.Mutational.status)))

anno_col <- list(Batch =colorRampPalette(c('darkgreen',"black", "purple"))(3)) #, RNA.Tumor.Purity = colorRampPalette(c("grey", "navy"))(500) )
names(anno_col$Batch) <- anno$Batch %>% unique()


anno
column_ha = HeatmapAnnotation( RNA.Tumor.Purity = anno$RNA.Tumor.Purity, Batch = anno$Batch, col = anno_col , GS.Mutational.status = anno$GS.Mutational.status)

pdf("/Users/ds/Desktop/plot.pdf", width = 10, height = 10)
Heatmap(matrix = t(mat), show_column_names = F, col  = colorRampPalette(c('navy',"white", "red"))(10), top_annotation = column_ha, cluster_columns = F, row_names_gp = gpar(fontsize = 5,col = ifelse(colnames(mat) %in% c(unique(up$Genes)) , "red", ifelse(colnames(mat)  %in% c(unique(down$Genes)), "blue", "black"))))
dev.off()

mat

```



#Plot lncRNAs in TME
```{r}

anno_col <- dplyr::select(comb_rna, c(batch, Purity, Grade_Mutational_status))


colnames(anno_col) <- c("Batch", "RNA.Tumor.Purity", "GS.Mutational.status")
anno_col <- mutate(anno_col, GS.Mutational.status = ifelse(GS.Mutational.status == 2, "AR WT", ifelse(GS.Mutational.status == 3, "AR Gain", GS.Mutational.status)))


mat <- comb_rna[,colnames(comb_rna) %in% notpca_lncrna] %>% t() %>% data.frame() %>% mutate(gene = row.names(.)) %>% inner_join(final_markers) %>% arrange(cluster) 
row.names(mat) <-mat$gene 

anno_row <- mat %>% dplyr::select(cluster)
colnames(anno_row) <- c("Cell.type")

mat <- mat %>% dplyr::select(starts_with(c("TCGA", "DTB", "PR")))







filt <- mat[apply(mat, 1, FUN = function(x){sum(x < 0)}) == ncol(mat),] %>% row.names()
filt
mat <- mat[!rownames(mat) %in% filt,]
anno_row <- anno_row[!rownames(anno_row) %in% filt,, drop = F]


column_ha = HeatmapAnnotation( RNA.Tumor.Purity = anno_col$RNA.Tumor.Purity, Batch = anno_col$Batch, GS.Mutational.status = anno_col$GS.Mutational.status)
row_ha <- rowAnnotation(Cell.type = anno_row$Cell.type)

anno_row
pdf("/Users/ds/Desktop/plot.pdf", width = 10, height = 10)
Heatmap(matrix = mat, show_column_names = F, col  = colorRampPalette(c('navy',"white", "red"))(10), top_annotation = column_ha, cluster_columns = F,left_annotation = row_ha, row_names_gp = gpar(fontsize = 5,col = ifelse(rownames(mat) %in% c(unique(up$Genes)) , "red", ifelse(rownames(mat)  %in% c(unique(down$Genes)), "blue", "black"))), row_split = anno_row$Cell.type)
dev.off()

length(notpca_lncrna)

dim(mat)

length(filt)
mat
dim(mat)


colnames(mat)

```

